Preambule

Analysis following previously published work on Brazilian data, using the benford.analysis R package.

Conclusions

To be completed.

Packages and global variables

clean_folder <- "~/Dropbox/aaa/studies/oucomo/clean data/"
library(readxl)
library(magrittr)
library(dplyr)
library(tidyr)
library(purrr)
library(benford.analysis)
library(tibble)
library(lubridate)
library(sf)
library(stringr)
library(stringi)

Utilitary functions

A function to extract the statistics of a Benford analysis we want to look at:

extract_statistics <- function(x) {
  with(x, c(with(info, c(n, n.second.order)),
            stats$chisq$statistic,
            MAD,
            MAD.conformity,
            distortion.factor,
            stats$mantissa.arc.test$statistic))
}

A function to make a table of statistics (in column) for each of the provinces (in row):

make_statistics_table <- function(x) {
  x %>% 
    map_df(extract_statistics) %>% 
    t() %>% 
    as.data.frame() %>% 
    rownames_to_column() %>% 
    setNames(c("province", "n", "n2", "Chisq", "MAD", "MAD conformity", "DF", "Mantissa")) %>% 
    as_tibble() %>% 
    mutate_at(vars(starts_with("n")), as.integer) %>% 
    mutate_at(vars(MAD, DF, Chisq, Mantissa), as.numeric)
}

A function that plots the diagnostic plots for each of the provinces:

plot_digits_distribution <- function(x, y) {
  xmarks <- barplot(x[["bfd"]]$data.dist.freq, col = 4,
                    xlab = "digits", ylab = "frequency", 
                    ylim = c(0, 1.1 * max(c(x[["bfd"]]$data.dist.freq,
                                            x[["bfd"]]$benford.dist.freq))))
  axis(1, at = xmarks, labels = x[["bfd"]]$digits)
  lines(xmarks, x[["bfd"]]$benford.dist.freq, lwd = 2, col = 2)
  title(y, line = -1)
}

Data

Vietnam 2019 census data

The 2019 census data (Vinh Long is missing):

census <- paste0(clean_folder, "census2019.rds") %>%
  readRDS() %>% 
  group_by(province) %>% 
  summarise(popsize = sum(n)) %>% 
  mutate_at("province", stri_trans_general, "Latin-ASCII") %>% 
  mutate_at("province", str_remove, "Thanh pho |Tinh ") %>% 
  bind_rows(tibble(province = "Vinh Long", popsize = 1141677))

Geographic data of Vietnam

Downloading the file if not in the folder:

if(!file.exists("gadm36_VNM_1_sf.rds"))
  download.file("https://biogeo.ucdavis.edu/data/gadm3.6/Rsf/gadm36_VNM_1_sf.rds",
                "gadm36_VNM_1_sf.rds")

Loading the data:

gadm <- "gadm36_VNM_1_sf.rds" %>%
  readRDS() %>% 
  st_set_crs(4326) %>% 
  transmute(province = VARNAME_1, geometry) %>% 
  mutate(centroid    = map(geometry, st_centroid),
         coordinates = map(centroid, st_coordinates),
         latitude    = map(coordinates, extract2, 2)) %>% 
  left_join(census, "province")

Rogier’s COVID-19 data

Reading the data:

rogier <-  paste0(clean_folder, "Fourth cluster First wave.xlsx") %>% 
  read_excel() %>% 
  rename(date = `...1`)

Getting rid off whatever is below the Total row:

rogier %<>% head(which(rogier$date == "Total") - 1)

Reformatting the data:

rogier %<>% 
  select(date, `An Giang`:last_col()) %>% 
  filter(! is.na(date)) %>% 
  mutate_all(replace_na, 0) %>% 
  mutate_all(as.integer) %>% 
  mutate_at("date", as.Date, origin = "1899-12-30")

which looks like:

rogier
## # A tibble: 339 x 64
##    date       `An Giang` `Ba Ria - Vung Tau` `Bac Giang` `Bac Kan` `Bac Lieu`
##    <date>          <int>               <int>       <int>     <int>      <int>
##  1 2020-01-23          0                   0           0         0          0
##  2 2020-02-01          0                   0           0         0          0
##  3 2020-02-03          0                   0           0         0          0
##  4 2020-02-04          0                   0           0         0          0
##  5 2020-02-06          0                   0           0         0          0
##  6 2020-02-07          0                   0           0         0          0
##  7 2020-02-09          0                   0           0         0          0
##  8 2020-02-11          0                   0           0         0          0
##  9 2020-02-13          0                   0           0         0          0
## 10 2020-03-06          0                   0           0         0          0
## # … with 329 more rows, and 58 more variables: Bac Ninh <int>, Ben Tre <int>,
## #   Binh Dinh <int>, Binh Duong <int>, Binh Phuoc <int>, Binh Thuan <int>,
## #   Ca Mau <int>, Can Tho <int>, Cao Bang <int>, Da Nang <int>, Dak Lak <int>,
## #   Dak Nong <int>, Dien Bien <int>, Dong Nai <int>, Dong Thap <int>,
## #   Gia Lai <int>, Ha Giang <int>, Ha Nam <int>, Ha Noi <int>, Ha Tinh <int>,
## #   Hai Duong <int>, Hai Phong <int>, Hau Giang <int>, Ho Chi Minh <int>,
## #   Hoa Binh <int>, Hue <int>, Hung Yen <int>, Khanh Hoa <int>,
## #   Kien Giang <int>, Kon Tum <int>, Lai Chau <int>, Lam Dong <int>,
## #   Lang Son <int>, Lao Cai <int>, Long An <int>, Nam Dinh <int>,
## #   Nghe An <int>, Ninh Binh <int>, Ninh Thuan <int>, Phu Tho <int>,
## #   Phu Yen <int>, Quang Binh <int>, Quang Nam <int>, Quang Ngai <int>,
## #   Quang Ninh <int>, Quang Tri <int>, Soc Trang <int>, Son La <int>,
## #   Tay Ninh <int>, Thai Binh <int>, Thai Nguyen <int>, Thanh Hoa <int>,
## #   Tien Giang <int>, Tra Vinh <int>, Tuyen Quang <int>, Vinh Long <int>,
## #   Vinh Phuc <int>, Yen Bai <int>

For compatibility of other data sets, we want to rename Hue into Thua Thien Hue:

names(rogier) <- str_replace(names(rogier), "Hue", "Thua Thien Hue")

NCSC’s COVID-19 data

Data automatically collected from NCSC:

ncsc <- readRDS(paste0(clean_folder, "NCSC data/covid.rds")) %>% 
  filter(var == "incidence") %>% 
  select(-var) %>% 
  mutate_at("province", str_replace, "TP HCM", "Ho Chi Minh")

which looks like this:

ncsc
## # A tibble: 37,189 x 3
##    date       province              n
##    <date>     <chr>             <int>
##  1 2020-03-18 An Giang              0
##  2 2020-03-18 Ba Ria - Vung Tau     0
##  3 2020-03-18 Bac Giang             0
##  4 2020-03-18 Bac Kan               0
##  5 2020-03-18 Bac Lieu              0
##  6 2020-03-18 Bac Ninh              0
##  7 2020-03-18 Ben Tre               0
##  8 2020-03-18 Binh Dinh             0
##  9 2020-03-18 Binh Duong            0
## 10 2020-03-18 Binh Phuoc            0
## # … with 37,179 more rows

Comparing Rogier’s and NCSC data

A long format of Rogier’s data:

rogier_long <- pivot_longer(rogier, -date, "province", values_to = "n_rogier")

Merging Rogier’s and NCSC’s data:

two_sources <- rogier_long %>% 
  full_join(ncsc, c("date", "province")) %>% 
  mutate_at(vars(starts_with("n")), replace_na, 0) %>% 
  filter(n_rogier > 0 | n > 0)

Plots to compare the two sources of data:

par(mfrow = c(1, 3), plt = c(.15, .9, .17, .9))

two_sources %$% 
  plot(n_rogier, n, xlab = "Rogier's data", ylab = "NCSC data",
       col = adjustcolor(4, .5))
abline(0, 1)

two_sources %$% 
  plot(n_rogier, n, xlab = "Rogier's data", ylab = "NCSC data",
       col = adjustcolor(4, .5), xlim = c(0, 9000), ylim = c(0, 9000))
abline(0, 1)

two_sources %>% 
  mutate_at(vars(starts_with("n")), ~ .x + .1) %$% 
  plot(n_rogier, n, xlab = "Rogier's data", ylab = "NCSC data",
       col = adjustcolor(4, .5), log = "xy")
abline(0, 1)

Comparing the dates of zero-counts for the two data sets:

two_sources %>% 
  filter(n < 1) %>% 
  pull(date) %>% 
  plot(., jitter(rep(1, length(.))), axes = FALSE, col = 2, ann = FALSE,
       xlim = ymd(c(20200101, 20211231)), ylim = c(.8, 1.1))

two_sources %>% 
  filter(n_rogier < 1) %>% 
  pull(date) %>% 
  points(., jitter(rep(.9, length(.))), col = 4)

month_val <- seq(1, 12, 2)
axis(1,
     ydm(paste0("2020-1-", month_val), paste0("2021-1-", month_val), "2022-1-1"),
     c(paste0(month.abb[month_val], "-20"), paste0(month.abb[month_val], "-21"), "Jan-22"))

month_val2 <- 1:12
abline(v = ydm(paste0("2020-1-", month_val2), paste0("2021-1-", month_val2), "2022-1-1")[-c(1, 2)])

text(ymd(20200115), 1, "NCSC", col = 2)
text(ymd(20200115), .9, "Rogier", col = 4)

The distribution of the counts in one data set when it’s 0 in the other data set:

par(mfrow = c(1, 2), plt = c(.15, .95, .17, .95))

hist2 <- function(..., color) {
  hist(..., n = 1000, col = color, border = color, yaxs = "i",
       main = NA, ylab = "frequency")
}

two_sources %>% 
  filter(n_rogier < 1) %>% 
  pull(n) %>% 
  hist2(color = 4, xlab = "value in NCSC when 0 count in Rogier")

two_sources %>% 
  filter(n < 1) %>% 
  pull(n_rogier) %>% 
  hist2(color = 2, xlab = "value in Rogier when 0 count in NCSC")

A function to plot the time series from the 2 sources for 1 province:

plot_prov_ts <- function(prov, xlim = ymd(c(20200101, 20220101)), lwd = 1) {
  ncsc %>% 
    filter(province == prov) %$%
    plot(date, n, type = "l", col = 2, xlab = NA, ylab = "incidence",
         xlim = xlim, lwd = lwd)

  rogier_long %>% 
    filter(province == prov) %$%
    lines(date, n_rogier, col = 4, lty = 2, lwd = lwd)
}

Comparing the time series of the two data sets for Ho Chi Minh city:

plot_prov_ts("Ho Chi Minh", ymd(c(20210601, 20220101)), lwd = 2)
legend("topright", legend = c("NCSC", "Rogier"),
       col = c(2, 4), lty = 1:2, bty = "n", lwd = 2)

Comparing for all the provinces:

par(mfrow = c(32, 2), plt = c(.2, .97, .17, .9))

ncsc %>% 
  pull(province) %>% 
  unique() %>% 
  sort() %>% 
  walk(~ {plot_prov_ts(.x); title(.x, line = -1)})

Analyzing Rogier’s data

All the data

Removing the dates and filtering out the zeros:

rogier1 <- rogier %>% 
  select(-date) %>% 
  map(~ .x[.x > 0])

Performing the Benford analysis on each of the 63 provinces:

benford_analyses_rogier1 <- map(rogier1, benford, number.of.digits = 1)

Extracting all these statistics for each of the 63 provinces. Interpretation is as follows:

  • n: number of data points on which the statistics are calculated.
  • n2: number of data points on which the second order test is performed.
  • Chisq: classical chi-square statistic. Note though that it is highly sensitive to the sample size and tends to reject the null even for small departures from the expected distribution. MAD statistic is thus to be preferred.
  • MAD: Mean Absolute Deviation. The MAD test is more robust since it ignores the number of records. The higher the MAD, the larger the average difference between the observed and theoretical distributions. MAD values above 0.015 suggest nonconformity.
  • MAD conformity: interpretation of the MAD statistic value.
  • DF: Distortion Factor. The DF statistic examines the digit patterns to indicate whether the data appear to be underestimated or overestimated and the deformity’s magnitude.
  • Mantissa: expected to be 0.5. Values less than 0.5 suggest that figures tend to be manipulated downward whereas it’s the opposite for values higher than 0.5.
table_rogier1 <- make_statistics_table(benford_analyses_rogier1)
print(table_rogier1, n = Inf)
## # A tibble: 63 x 8
##    province        n    n2  Chisq     MAD `MAD conformity`           DF Mantissa
##    <chr>       <int> <int>  <dbl>   <dbl> <chr>                   <dbl>    <dbl>
##  1 An Giang      143   114   4.38 0.0145  Marginally acceptabl… -13.7   0.00177 
##  2 Ba Ria - V…   129    82  16.1  0.0342  Nonconformity          -5.98  0.00984 
##  3 Bac Giang     123    49  10.5  0.0243  Nonconformity         -15.0   0.00331 
##  4 Bac Kan        13     2  12.6  0.0996  Nonconformity         NaN     0.274   
##  5 Bac Lieu      124    68  19.4  0.0373  Nonconformity         -16.5   0.0390  
##  6 Bac Ninh      139    54   1.29 0.00899 Acceptable conformity -15.3   0.00597 
##  7 Ben Tre       125    73   9.08 0.0238  Nonconformity          -5.88  0.00707 
##  8 Binh Dinh     132    58  17.7  0.0341  Nonconformity         -37.5   0.0676  
##  9 Binh Duong    147   142  60.0  0.0550  Nonconformity          21.6   0.135   
## 10 Binh Phuoc    115    66  10.8  0.0297  Nonconformity         -29.9   0.0176  
## 11 Binh Thuan    121    86  44.0  0.0543  Nonconformity           7.89  0.139   
## 12 Ca Mau        112    68  12.3  0.0336  Nonconformity          -9.55  0.0392  
## 13 Can Tho       133    90  22.0  0.0364  Nonconformity           9.13  0.00876 
## 14 Cao Bang       23    15   8.82 0.0629  Nonconformity         -65.1   0.0746  
## 15 Da Nang       120    64   7.92 0.0205  Nonconformity          -0.793 0.0221  
## 16 Dak Lak       106    80  12.2  0.0329  Nonconformity         -11.8   0.0500  
## 17 Dak Nong      115    45  14.1  0.0312  Nonconformity          -4.75  0.0584  
## 18 Dien Bien      45    19   5.21 0.0322  Nonconformity         -44.4   0.0227  
## 19 Dong Nai      151   134 101.   0.0808  Nonconformity          50.6   0.277   
## 20 Dong Thap     142   102  35.5  0.0423  Nonconformity           8.72  0.0367  
## 21 Gia Lai       109    57   7.18 0.0203  Nonconformity          -1.53  0.00845 
## 22 Ha Giang       51    45  28.2  0.0797  Nonconformity          -6.27  0.202   
## 23 Ha Nam         90    31  30.5  0.0594  Nonconformity         -45.2   0.104   
## 24 Ha Noi        168    78   3.56 0.0127  Marginally acceptabl…  -2.41  0.0172  
## 25 Ha Tinh        99    29  17.1  0.0409  Nonconformity         -36.2   0.00290 
## 26 Hai Duong     112    36  15.2  0.0322  Nonconformity         -25.8   0.00757 
## 27 Hai Phong      45    17  11.2  0.0489  Nonconformity         -48.8   0.0412  
## 28 Hau Giang     104    62   5.61 0.0190  Nonconformity         -19.4   0.00799 
## 29 Ho Chi Minh   192   174  23.5  0.0308  Nonconformity          -5.19  0.00437 
## 30 Hoa Binh       32    15  12.8  0.0520  Nonconformity           5.36  0.000277
## 31 Thua Thien…   103    51  11.3  0.0299  Nonconformity         -15.2   0.0444  
## 32 Hung Yen       65    23  19.9  0.0547  Nonconformity         -42.9   0.112   
## 33 Khanh Hoa     137    90  16.8  0.0286  Nonconformity          -9.27  0.0351  
## 34 Kien Giang    137   103  30.1  0.0392  Nonconformity           6.00  0.0106  
## 35 Kon Tum        62    18   6.97 0.0298  Nonconformity         -60.5   0.00649 
## 36 Lai Chau       16     4  11.4  0.0755  Nonconformity         NaN     0.146   
## 37 Lam Dong       92    42   8.39 0.0244  Nonconformity         -14.7   0.00723 
## 38 Lang Son       77    16   3.86 0.0223  Nonconformity         -44.8   0.0370  
## 39 Lao Cai        49     7   8.75 0.0319  Nonconformity         -54.0   0.00321 
## 40 Long An       144   113  17.4  0.0282  Nonconformity          12.8   0.0564  
## 41 Nam Dinh       69    40  10.7  0.0401  Nonconformity          -3.09  0.0493  
## 42 Nghe An       124    63   6.02 0.0180  Nonconformity          -4.56  0.00912 
## 43 Ninh Binh      46    12   7.86 0.0353  Nonconformity         -61.7   0.0185  
## 44 Ninh Thuan    117    48  20.3  0.0350  Nonconformity         -10.4   0.0776  
## 45 Phu Tho        65    37  16.6  0.0435  Nonconformity           6.64  0.0671  
## 46 Phu Yen       153    47  10.9  0.0260  Nonconformity         -37.7   0.0181  
## 47 Quang Binh    104    49   9.00 0.0242  Nonconformity         -28.6   0.00553 
## 48 Quang Nam     108    51   8.04 0.0237  Nonconformity          -6.50  0.0171  
## 49 Quang Ngai    119    47  15.9  0.0285  Nonconformity         -26.9   0.00436 
## 50 Quang Ninh     54    27  13.2  0.0432  Nonconformity         -32.6   0.0500  
## 51 Quang Tri      88    27   3.57 0.0164  Nonconformity         -48.3   0.0144  
## 52 Soc Trang      82    67   6.18 0.0261  Nonconformity         -11.6   0.000278
## 53 Son La         66    18  15.0  0.0454  Nonconformity         -60.9   0.0710  
## 54 Tay Ninh      131   110  20.4  0.0307  Nonconformity           6.56  0.0342  
## 55 Thai Binh      62    26   5.63 0.0248  Nonconformity          20.6   0.0784  
## 56 Thai Nguyen    38    16   5.65 0.0306  Nonconformity          24.2   0.0156  
## 57 Thanh Hoa     117    48   9.67 0.0291  Nonconformity          -3.92  0.0191  
## 58 Tien Giang    139   118   8.39 0.0229  Nonconformity          -5.77  0.00817 
## 59 Tra Vinh      119    76   9.03 0.0239  Nonconformity          -9.04  0.00150 
## 60 Tuyen Quang    40    11  54.4  0.0942  Nonconformity         -26.6   0.178   
## 61 Vinh Long     137    74  24.1  0.0307  Nonconformity          -8.72  0.0264  
## 62 Vinh Phuc      60    31   8.82 0.0378  Nonconformity         -22.6   0.0265  
## 63 Yen Bai        28     8  18.5  0.0844  Nonconformity         -49.9   0.206

Plotting the diagnostic plots for each of the 63 provinces:

par(mfrow = c(16, 4), plt = c(.2, .97, .17, .9))
walk2(benford_analyses_rogier1,
      names(benford_analyses_rogier1),
      plot_digits_distribution)

Data from January 2021

Same analysis as above but with data from January 2021 only:

rogier2 <- rogier %>% 
  filter(date > ymd(20210101)) %>% 
  select(-date) %>% 
  map(~ .x[.x > 0])

benford_analyses_rogier2 <- map(rogier2, benford, number.of.digits = 1)

table_rogier2 <- make_statistics_table(benford_analyses_rogier2)
print(table_rogier2, n = Inf)
## # A tibble: 63 x 8
##    province        n    n2  Chisq     MAD `MAD conformity`           DF Mantissa
##    <chr>       <int> <int>  <dbl>   <dbl> <chr>                   <dbl>    <dbl>
##  1 An Giang      143   114   4.38 0.0145  Marginally acceptabl… -13.7   0.00177 
##  2 Ba Ria - V…   129    82  16.1  0.0342  Nonconformity          -5.98  0.00984 
##  3 Bac Giang     123    49  10.5  0.0243  Nonconformity         -15.0   0.00331 
##  4 Bac Kan        13     2  12.6  0.0996  Nonconformity         NaN     0.274   
##  5 Bac Lieu      124    68  19.4  0.0373  Nonconformity         -16.5   0.0390  
##  6 Bac Ninh      139    54   1.29 0.00899 Acceptable conformity -15.3   0.00597 
##  7 Ben Tre       125    73   9.08 0.0238  Nonconformity          -5.88  0.00707 
##  8 Binh Dinh     132    58  17.7  0.0341  Nonconformity         -37.5   0.0676  
##  9 Binh Duong    147   142  60.0  0.0550  Nonconformity          21.6   0.135   
## 10 Binh Phuoc    115    66  10.8  0.0297  Nonconformity         -29.9   0.0176  
## 11 Binh Thuan    121    86  44.0  0.0543  Nonconformity           7.89  0.139   
## 12 Ca Mau        112    68  12.3  0.0336  Nonconformity          -9.55  0.0392  
## 13 Can Tho       133    90  22.0  0.0364  Nonconformity           9.13  0.00876 
## 14 Cao Bang       23    15   8.82 0.0629  Nonconformity         -65.1   0.0746  
## 15 Da Nang       119    64   7.73 0.0203  Nonconformity          -0.793 0.0203  
## 16 Dak Lak       106    80  12.2  0.0329  Nonconformity         -11.8   0.0500  
## 17 Dak Nong      115    45  14.1  0.0312  Nonconformity          -4.75  0.0584  
## 18 Dien Bien      45    19   5.21 0.0322  Nonconformity         -44.4   0.0227  
## 19 Dong Nai      151   134 101.   0.0808  Nonconformity          50.6   0.277   
## 20 Dong Thap     142   102  35.5  0.0423  Nonconformity           8.72  0.0367  
## 21 Gia Lai       109    57   7.18 0.0203  Nonconformity          -1.53  0.00845 
## 22 Ha Giang       51    45  28.2  0.0797  Nonconformity          -6.27  0.202   
## 23 Ha Nam         90    31  30.5  0.0594  Nonconformity         -45.2   0.104   
## 24 Ha Noi        165    78   3.32 0.0117  Acceptable conformity  -2.41  0.0165  
## 25 Ha Tinh        99    29  17.1  0.0409  Nonconformity         -36.2   0.00290 
## 26 Hai Duong     112    36  15.2  0.0322  Nonconformity         -25.8   0.00757 
## 27 Hai Phong      45    17  11.2  0.0489  Nonconformity         -48.8   0.0412  
## 28 Hau Giang     104    62   5.61 0.0190  Nonconformity         -19.4   0.00799 
## 29 Ho Chi Minh   188   174  23.6  0.0316  Nonconformity          -5.19  0.00345 
## 30 Hoa Binh       32    15  12.8  0.0520  Nonconformity           5.36  0.000277
## 31 Thua Thien…   103    51  11.3  0.0299  Nonconformity         -15.2   0.0444  
## 32 Hung Yen       65    23  19.9  0.0547  Nonconformity         -42.9   0.112   
## 33 Khanh Hoa     137    90  16.8  0.0286  Nonconformity          -9.27  0.0351  
## 34 Kien Giang    137   103  30.1  0.0392  Nonconformity           6.00  0.0106  
## 35 Kon Tum        62    18   6.97 0.0298  Nonconformity         -60.5   0.00649 
## 36 Lai Chau       16     4  11.4  0.0755  Nonconformity         NaN     0.146   
## 37 Lam Dong       92    42   8.39 0.0244  Nonconformity         -14.7   0.00723 
## 38 Lang Son       77    16   3.86 0.0223  Nonconformity         -44.8   0.0370  
## 39 Lao Cai        49     7   8.75 0.0319  Nonconformity         -54.0   0.00321 
## 40 Long An       144   113  17.4  0.0282  Nonconformity          12.8   0.0564  
## 41 Nam Dinh       69    40  10.7  0.0401  Nonconformity          -3.09  0.0493  
## 42 Nghe An       124    63   6.02 0.0180  Nonconformity          -4.56  0.00912 
## 43 Ninh Binh      46    12   7.86 0.0353  Nonconformity         -61.7   0.0185  
## 44 Ninh Thuan    117    48  20.3  0.0350  Nonconformity         -10.4   0.0776  
## 45 Phu Tho        65    37  16.6  0.0435  Nonconformity           6.64  0.0671  
## 46 Phu Yen       153    47  10.9  0.0260  Nonconformity         -37.7   0.0181  
## 47 Quang Binh    104    49   9.00 0.0242  Nonconformity         -28.6   0.00553 
## 48 Quang Nam     108    51   8.04 0.0237  Nonconformity          -6.50  0.0171  
## 49 Quang Ngai    119    47  15.9  0.0285  Nonconformity         -26.9   0.00436 
## 50 Quang Ninh     54    27  13.2  0.0432  Nonconformity         -32.6   0.0500  
## 51 Quang Tri      88    27   3.57 0.0164  Nonconformity         -48.3   0.0144  
## 52 Soc Trang      82    67   6.18 0.0261  Nonconformity         -11.6   0.000278
## 53 Son La         66    18  15.0  0.0454  Nonconformity         -60.9   0.0710  
## 54 Tay Ninh      131   110  20.4  0.0307  Nonconformity           6.56  0.0342  
## 55 Thai Binh      62    26   5.63 0.0248  Nonconformity          20.6   0.0784  
## 56 Thai Nguyen    38    16   5.65 0.0306  Nonconformity          24.2   0.0156  
## 57 Thanh Hoa     117    48   9.67 0.0291  Nonconformity          -3.92  0.0191  
## 58 Tien Giang    139   118   8.39 0.0229  Nonconformity          -5.77  0.00817 
## 59 Tra Vinh      119    76   9.03 0.0239  Nonconformity          -9.04  0.00150 
## 60 Tuyen Quang    40    11  54.4  0.0942  Nonconformity         -26.6   0.178   
## 61 Vinh Long     137    74  24.1  0.0307  Nonconformity          -8.72  0.0264  
## 62 Vinh Phuc      60    31   8.82 0.0378  Nonconformity         -22.6   0.0265  
## 63 Yen Bai        28     8  18.5  0.0844  Nonconformity         -49.9   0.206
par(mfrow = c(16, 4), plt = c(.2, .97, .17, .9))
walk2(benford_analyses_rogier2,
      names(benford_analyses_rogier2),
      plot_digits_distribution)

Let’s compare with the table we get with all the data:

vars <- setdiff(names(table_rogier1), c("province", "MAD conformity"))
par(mfrow = c(2, 3), plt = c(.2, .97, .17, .9), cex = 1)
walk(vars, ~ {
  plot(table_rogier1[[.x]], table_rogier2[[.x]], col = 4,
       xlab = "all the data", ylab = "from Jan 2021")
  abline(0, 1)
  title(.x, line = -1)})

Specific comparisons

Group 1: Hai Duong and Quang Ninh between 1 January 2021 and 1 April 2021

group1 <- rogier %>% 
  transmute(date, n = `Hai Duong` + `Quang Ninh`) %>% 
  filter(ymd(20210101) < date, date < ymd(20210401)) %>% 
  select(-date) %>% 
  map(~ .x[.x > 0])

Group 2: Bac Giang and Bac Ninh between 1 April 2021 and 1 August 2021

group2 <- rogier %>% 
  transmute(date, n = `Bac Giang` + `Bac Ninh`) %>% 
  filter(ymd(20210401) < date, date < ymd(20210801)) %>% 
  select(-date) %>% 
  map(~ .x[.x > 0])

Group 3: HCMC and Mekong between 1 May 2021 and 15 October 2021

The fourth wave:

wave4 <- rogier %>% 
  filter(ymd(20210501) < date, date < ymd(20211015)) %>% 
  select(-date)

Merging with GADM data:

wave4gadm <- wave4 %>% 
  colSums() %>% 
  list() %>% 
  data.frame() %>% 
  setNames("n") %>% 
  rownames_to_column("province") %>% 
  left_join(gadm, ., "province") %>% 
  mutate(incidence = 100000 * n / popsize)

The south wave in the south:

wave4mekong <- wave4gadm %>% 
  filter(latitude < 12, incidence > 50)

Which looks like this (in red):

par(plt = c(0, 1, 0, 1))

gadm %>% 
  st_geometry() %>% 
  plot(col = "grey")

wave4mekong %>% 
  st_geometry() %>% 
  plot(add = TRUE, col = "red")

Group 3 is then:

group3 <- wave4 %>% 
  magrittr::extract(wave4mekong$province) %>% 
  rowSums() %>% 
  list()

Group 4: the whole country after 15 October 2021

group4 <- rogier %>% 
  filter(date > ymd(20211015)) %>% 
  select(-date) %>% 
  rowSums() %>% 
  list()

Let’s group the 4 groups into a list:

groups <- c(group1, group2, group3, group4) %>% 
  setNames(paste0("group", 1:4))

on which we can rerun the previous analyses:

benford_analyses_rogier_4groups <- map(groups, benford, number.of.digits = 1)

make_statistics_table(benford_analyses_rogier_4groups)
## # A tibble: 4 x 8
##   province     n    n2 Chisq    MAD `MAD conformity`     DF Mantissa
##   <chr>    <int> <int> <dbl>  <dbl> <chr>             <dbl>    <dbl>
## 1 group1      41    16 17.6  0.0465 Nonconformity    -30.7   0.00933
## 2 group2      62    37 14.0  0.0481 Nonconformity     -2.84  0.0852 
## 3 group3     137   125  8.38 0.0182 Nonconformity      3.28  0.0359 
## 4 group4      49    48 21.8  0.0563 Nonconformity     25.9   0.113
par(mfrow = c(1, 4), plt = c(.2, .97, .17, .9))
walk2(benford_analyses_rogier_4groups,
      names(benford_analyses_rogier_4groups),
      plot_digits_distribution)

Analyzing NCSC’s data

All the data

ncsc_wide <- pivot_wider(ncsc, names_from = province, values_from = n)
  
ncsc1 <- ncsc_wide %>% 
  select(-date) %>% 
  map(~ .x[.x > 0])

benford_analyses_ncsc1 <- map(ncsc1, benford, number.of.digits = 1)

table_ncsc1 <- make_statistics_table(benford_analyses_ncsc1)
print(table_ncsc1, n = Inf)
## # A tibble: 63 x 8
##    province         n    n2 Chisq    MAD `MAD conformity`            DF Mantissa
##    <chr>        <int> <int> <dbl>  <dbl> <chr>                    <dbl>    <dbl>
##  1 An Giang       161   117  9.71 0.0236 Nonconformity          -14.8   0.00693 
##  2 Ba Ria - Vu…   175    89 24.9  0.0389 Nonconformity           -9.39  0.0360  
##  3 Bac Giang      157    68 22.3  0.0358 Nonconformity          -17.9   0.0398  
##  4 Bac Kan         20     2 25.6  0.111  Nonconformity          NaN     0.521   
##  5 Bac Lieu       145    67 16.4  0.0326 Nonconformity          -18.4   0.0136  
##  6 Bac Ninh       173    64 10.5  0.0193 Nonconformity           -8.73  0.000392
##  7 Ben Tre        129    72  9.03 0.0223 Nonconformity           -3.03  0.0183  
##  8 Binh Dinh      139    58 17.8  0.0361 Nonconformity          -37.9   0.0616  
##  9 Binh Duong     187   153 29.1  0.0301 Nonconformity           14.8   0.0497  
## 10 Binh Phuoc     130    67  7.58 0.0225 Nonconformity          -29.5   0.00835 
## 11 Binh Thuan     143    91 34.0  0.0468 Nonconformity           10.5   0.110   
## 12 Ca Mau         127    68  9.68 0.0262 Nonconformity          -10.8   0.0192  
## 13 Can Tho        151    96 19.9  0.0328 Nonconformity           10.0   0.0172  
## 14 Cao Bang        22    14 10.7  0.0704 Nonconformity          -65.1   0.0963  
## 15 Da Nang        241    73 20.2  0.0303 Nonconformity           -9.88  0.0332  
## 16 Dak Lak        113    79 11.9  0.0306 Nonconformity          -11.5   0.0452  
## 17 Dak Nong       124    44 17.3  0.0328 Nonconformity           -6.79  0.0689  
## 18 Dien Bien       53    19  9.85 0.0446 Nonconformity          -46.0   0.0593  
## 19 Dong Nai       170   137 80.0  0.0677 Nonconformity           52.6   0.285   
## 20 Dong Thap      165   110 16.3  0.0271 Nonconformity            7.47  0.0217  
## 21 Gia Lai        130    57  8.33 0.0232 Nonconformity           -4.93  0.0112  
## 22 Ha Giang        60    47 26.3  0.0710 Nonconformity           -4.83  0.185   
## 23 Ha Nam         115    32 37.4  0.0582 Nonconformity          -44.4   0.0699  
## 24 Ha Noi         269    89 14.7  0.0245 Nonconformity           -9.59  0.0202  
## 25 Ha Tinh        139    28  9.86 0.0245 Nonconformity          -42.2   0.00962 
## 26 Hai Duong      162    40 20.3  0.0320 Nonconformity          -30.2   0.0155  
## 27 Hai Phong       62    16 13.9  0.0454 Nonconformity          -50.8   0.0302  
## 28 Hau Giang      110    64 10.2  0.0273 Nonconformity          -24.6   0.00171 
## 29 Hoa Binh        49    15 14.7  0.0480 Nonconformity           10.0   0.0121  
## 30 Hung Yen       122    23 28.9  0.0502 Nonconformity          -47.6   0.0933  
## 31 Khanh Hoa      197   101 20.0  0.0286 Nonconformity          -16.8   0.0216  
## 32 Kien Giang     172   112 25.2  0.0309 Nonconformity            5.99  0.0158  
## 33 Kon Tum         66    17  9.27 0.0361 Nonconformity          -60.6   0.00598 
## 34 Lai Chau        18     4 11.5  0.0792 Nonconformity          NaN     0.171   
## 35 Lam Dong       106    43 11.0  0.0252 Nonconformity          -21.0   0.00596 
## 36 Lang Son        99    16 11.2  0.0323 Nonconformity          -49.3   0.0542  
## 37 Lao Cai         61    10  9.10 0.0364 Nonconformity          -65.1   0.0125  
## 38 Long An        170   118 27.0  0.0316 Nonconformity           20.2   0.0963  
## 39 Nam Dinh        87    40  9.81 0.0272 Nonconformity           -3.10  0.0170  
## 40 Nghe An        147    65  9.71 0.0246 Nonconformity          -10.8   0.0156  
## 41 Ninh Binh       76    15  7.51 0.0300 Nonconformity          -59.4   0.0430  
## 42 Ninh Thuan     140    51 13.8  0.0268 Nonconformity          -14.9   0.0583  
## 43 Phu Tho         76    38 10.5  0.0325 Nonconformity            6.46  0.0201  
## 44 Phu Yen        160    59  5.14 0.0150 Marginally acceptable… -18.4   0.000545
## 45 Quang Binh     114    49  6.62 0.0191 Nonconformity          -30.4   0.00360 
## 46 Quang Nam      174    51 18.2  0.0318 Nonconformity          -18.7   0.0478  
## 47 Quang Ngai     150    48 10.3  0.0252 Nonconformity          -30.2   0.000540
## 48 Quang Ninh      82    29 15.2  0.0397 Nonconformity          -33.3   0.0194  
## 49 Quang Tri      101    27  4.97 0.0202 Nonconformity          -48.3   0.0198  
## 50 Soc Trang       95    72  5.30 0.0210 Nonconformity          -14.0   0.00126 
## 51 Son La          68    18 16.0  0.0439 Nonconformity          -60.9   0.0686  
## 52 Tay Ninh       182   119 12.8  0.0248 Nonconformity            0.376 0.0231  
## 53 Thai Binh       93    27  9.47 0.0326 Nonconformity           20.3   0.0526  
## 54 Thai Nguyen     50    15 14.5  0.0515 Nonconformity           11.6   0.0939  
## 55 Thanh Hoa      142    49  4.00 0.0160 Nonconformity           -8.51  0.00581 
## 56 Thua Thien …   121    50 15.8  0.0344 Nonconformity          -15.8   0.0538  
## 57 Tien Giang     153   124  9.03 0.0231 Nonconformity           -4.54  0.00698 
## 58 Ho Chi Minh    279   182 25.5  0.0289 Nonconformity            3.00  0.0161  
## 59 Tra Vinh       138    77  7.79 0.0192 Nonconformity           -3.34  0.00382 
## 60 Tuyen Quang     41    11 63.2  0.102  Nonconformity          -26.8   0.190   
## 61 Vinh Long      140    71 29.7  0.0322 Nonconformity           -7.78  0.0315  
## 62 Vinh Phuc       86    32  7.31 0.0272 Nonconformity          -28.8   0.0418  
## 63 Yen Bai         31     7 21.5  0.0875 Nonconformity          -49.5   0.235
par(mfrow = c(16, 4), plt = c(.2, .97, .17, .9))
walk2(benford_analyses_ncsc1,
      names(benford_analyses_ncsc1),
      plot_digits_distribution)

Specific comparisons

ncsc_group1 <- ncsc_wide %>% 
  transmute(date, n = `Hai Duong` + `Quang Ninh`) %>% 
  filter(ymd(20210101) < date, date < ymd(20210401)) %>% 
  select(-date) %>% 
  map(~ .x[.x > 0])

ncsc_group2 <- ncsc_wide %>% 
  transmute(date, n = `Bac Giang` + `Bac Ninh`) %>% 
  filter(ymd(20210401) < date, date < ymd(20210801)) %>% 
  select(-date) %>% 
  map(~ .x[.x > 0])

ncsc_group3 <- ncsc_wide %>% 
  filter(ymd(20210501) < date, date < ymd(20211015)) %>% 
  select(-date) %>% 
  magrittr::extract(wave4mekong$province) %>% 
  rowSums() %>% 
  list()

ncsc_group4 <- ncsc_wide %>% 
  filter(date > ymd(20211015)) %>% 
  select(-date) %>% 
  rowSums() %>% 
  list()

ncsc_groups <- c(ncsc_group1, ncsc_group2, ncsc_group3, ncsc_group4) %>% 
  setNames(paste0("group", 1:4))

benford_analyses_ncsc_4groups <- map(ncsc_groups, benford, number.of.digits = 1)

make_statistics_table(benford_analyses_ncsc_4groups)
## # A tibble: 4 x 8
##   province     n    n2 Chisq    MAD `MAD conformity`    DF Mantissa
##   <chr>    <int> <int> <dbl>  <dbl> <chr>            <dbl>    <dbl>
## 1 group1      48    24  8.28 0.0372 Nonconformity    -35.6   0.0317
## 2 group2      81    55 16.0  0.0386 Nonconformity    -29.7   0.0369
## 3 group3     156   136 36.7  0.0419 Nonconformity     18.2   0.156 
## 4 group4      48    47 25.8  0.0637 Nonconformity     26.9   0.120
par(mfrow = c(1, 4), plt = c(.2, .97, .17, .9))
walk2(benford_analyses_ncsc_4groups,
      names(benford_analyses_ncsc_4groups),
      plot_digits_distribution)